install.packages('corpcor')
install.packages('mctest')
require(mgcv)
require(corpcor)
require(mctest)


# Importing datasets
# Read the raw seller-level CSV (comma separated, first row is the header).
all_sellerdata <- read.csv(
  file = "C:\\Users\\f00456n\\Documents\\Amazon Price Dynamics\\Data Scraped\\New Data\\Modeling data sets\\Luggage\\finalnumsellers_modeldata1_luggage.csv",
  header = TRUE,
  sep = ","
)

# Modelling sample: rows 4 through 138 of the raw file, with every missing
# value replaced by 0.
modeldata <- all_sellerdata[4:138, ]
modeldata <- replace(modeldata, is.na(modeldata), 0)

# Put the columns of modeldata on the search path so the rest of the script
# can refer to them by bare name.
attach(modeldata)


# Olympia : Number of sellers

# Candidate predictors for the Olympia model. The bare names are presumably
# columns made visible by attach(modeldata) -- confirm if that call changes.
# cbind() first combines them into a single matrix (so all columns share one
# type); data.frame() then turns that matrix into named columns.
numsellers_oly_matrix <- cbind(
  decr10per_3p_oly, decr20per_3p_oly, decr5per_3p_oly,
  incr10per_3p_oly, incr20per_3p_oly, incr5per_3p_oly,
  buyboxprice_olympia_lag1, nonbbox3pprice_oly_lag1,
  oly_bottombrandlag1, oly_topbrandlag1,
  priceAmzn_oly_lag1,
  incr10per_amzn_oly, decr5per_amzn_oly,
  weekend,
  # Additional variables, used in the absence of review text
  Answered_Questionsolympia_lag1, Product_reviews_olympia_lag1,
  prodstar_olympia_lag1,
  salerank_olympia_lag1, salerank_subcat_olympia_lag1,
  num_olympiaused3p_lag1, num_olympiausedamzn_lag1,
  seasonal_sale
)
data_numsellers_oly <- data.frame(numsellers_oly_matrix)


# VARIANCE CHECK
# Sample variance of every candidate predictor column. vapply() iterates the
# columns directly (safe even for a zero-column frame, unlike 1:ncol()) and
# insists that var() yields exactly one numeric value per column.
variance_X <- vapply(data_numsellers_oly, var, numeric(1), USE.NAMES = FALSE)

# Number of variables with non-zero variance.
nonzero_var <- sum(variance_X != 0)

# Retaining variables with non-zero variance.
# drop = FALSE keeps the result a data frame even when only one column
# survives; without it that single column collapses to a bare vector and
# loses its column name.
trainX_retained <- data_numsellers_oly[, variance_X != 0, drop = FALSE]

data_numsellers_oly <- trainX_retained

# Olympia seller-count model. The formula contains no smooth terms, so every
# predictor enters linearly; the fit uses a Gaussian family with identity
# link and REML.
#
# Candidate terms currently left out of the formula:
#   buyboxprice_olympia_lag1, decr20per_3p_oly, decr5per_3p_oly,
#   incr10per_3p_oly, incr20per_3p_oly, incr5per_3p_oly, oly_topbrandlag1,
#   Y_clust2_olympia_lag1, oly_bottombrandlag1
#
# NOTE(review): NUM_SELLERS_OLY is not one of the columns placed in
# data_numsellers_oly by this script, so gam() presumably resolves it through
# the attach()ed modeldata. The same fallback would apply to any predictor
# removed by the variance check -- confirm this is intended.
gam_numsellers_oly <- gam(
  formula = NUM_SELLERS_OLY ~
    Answered_Questionsolympia_lag1 +
    nonbbox3pprice_oly_lag1 +
    priceAmzn_oly_lag1 +
    decr10per_3p_oly +
    incr10per_amzn_oly +
    decr5per_amzn_oly +
    salerank_subcat_olympia_lag1 +
    Product_reviews_olympia_lag1 +
    salerank_olympia_lag1 +
    weekend +
    seasonal_sale,
  family = gaussian(link = 'identity'),
  data = data_numsellers_oly,
  method = "REML",
  optimizer = c("outer", "newton"),
  fit = TRUE
)

# Print the coefficient table and fit statistics.
summary(gam_numsellers_oly)

# Rockland - Number of sellers

# Candidate predictors for the Rockland model. The bare names are presumably
# columns made visible by attach(modeldata) -- confirm if that call changes.
# cbind() first combines them into a single matrix (so all columns share one
# type); data.frame() then turns that matrix into named columns.
numsellers_rl_matrix <- cbind(
  decr10per_3p_rcklnd, decr20per_3p_rcklnd, decr5per_3p_rcklnd,
  incr10per_3p_rcklnd, incr20per_3p_rcklnd, incr5per_3p_rcklnd,
  incr10per_amzn_rcklnd,
  buyboxprice_rcklnd_lag1, nonbbox3pprice_rcklnd_lag1,
  priceclust2_rcklnd_lag1, priceAmzn_rcklnd_lag1,
  rcklnd_bottombrandlag1, rcklnd_topbrandlag1,
  weekend,
  # Additional variables, used in the absence of review text
  Answered_Questionsrcklnd_lag1, Product_reviews_rcklnd_lag1,
  prodstar_rcklnd_lag1,
  salerank_rcklnd_lag1, salerank_subcat_rcklnd_lag1,
  num_rcklndused3p_lag1, num_rcklndusedamzn_lag1,
  seasonal_sale
)
data_numsellers_rl <- data.frame(numsellers_rl_matrix)
                         

# VARIANCE CHECK
# Sample variance of every candidate predictor column. vapply() iterates the
# columns directly (safe even for a zero-column frame, unlike 1:ncol()) and
# insists that var() yields exactly one numeric value per column.
variance_X <- vapply(data_numsellers_rl, var, numeric(1), USE.NAMES = FALSE)

# Number of variables with non-zero variance.
nonzero_var <- sum(variance_X != 0)

# Retaining variables with non-zero variance.
# drop = FALSE keeps the result a data frame even when only one column
# survives; without it that single column collapses to a bare vector and
# loses its column name.
trainX_retained <- data_numsellers_rl[, variance_X != 0, drop = FALSE]

data_numsellers_rl <- trainX_retained

# Rockland seller-count model. The formula contains no smooth terms, so every
# predictor enters linearly; the fit uses a Gaussian family with identity
# link and REML.
#
# Candidate terms currently left out of the formula:
#   incr10per_amzn_rcklnd, priceAmzn_rcklnd_lag1, rcklnd_bottombrandlag1,
#   decr5per_3p_rcklnd
#
# NOTE(review): NUM_SELLERS_RL is not one of the columns placed in
# data_numsellers_rl by this script, so gam() presumably resolves it through
# the attach()ed modeldata. The same fallback would apply to any predictor
# removed by the variance check -- confirm this is intended.
gam_numsellers_rl <- gam(
  formula = NUM_SELLERS_RL ~
    Answered_Questionsrcklnd_lag1 +
    buyboxprice_rcklnd_lag1 +
    nonbbox3pprice_rcklnd_lag1 +
    priceclust2_rcklnd_lag1 +
    prodstar_rcklnd_lag1 +
    salerank_subcat_rcklnd_lag1 +
    Product_reviews_rcklnd_lag1 +
    salerank_rcklnd_lag1 +
    weekend +
    seasonal_sale,
  family = gaussian(link = 'identity'),
  data = data_numsellers_rl,
  method = "REML",
  optimizer = c("outer", "newton"),
  fit = TRUE
)

# Print the coefficient table and fit statistics.
summary(gam_numsellers_rl)


# Samsonite: Number of sellers

# Candidate predictors for the Samsonite model. The bare names are presumably
# columns made visible by attach(modeldata) -- confirm if that call changes.
# cbind() first combines them into a single matrix (so all columns share one
# type); data.frame() then turns that matrix into named columns.
numsellers_sam_matrix <- cbind(
  decr10per_3p_sam, decr20per_3p_sam, decr5per_3p_sam,
  incr10per_3p_sam, incr20per_3p_sam, incr5per_3p_sam,
  decr10per_amzn_sam, decr20per_amzn_sam, decr5per_amzn_sam,
  incr10per_amzn_sam, incr20per_amzn_sam, incr5per_amzn_sam,
  buyboxprice_smsnite_lag1, nonbbox3pprice_sam_lag1, priceAmzn_sam_lag1,
  smsnte_bottombrandlag1, smsnte_topbrandlag1,
  weekend,
  # Additional variables, used in the absence of review text
  Answered_Questionssmsnite_lag1, Product_reviews_smsnite_lag1,
  salerank_smsnite_lag1, salerank_subcat_smsnite_lag1,
  num_smsniteused3p_lag1, num_smsniteusedamzn_lag1,
  seasonal_sale
)
data_numsellers_sam <- data.frame(numsellers_sam_matrix)

# VARIANCE CHECK
# Sample variance of every candidate predictor column. vapply() iterates the
# columns directly (safe even for a zero-column frame, unlike 1:ncol()) and
# insists that var() yields exactly one numeric value per column.
variance_X <- vapply(data_numsellers_sam, var, numeric(1), USE.NAMES = FALSE)

# Number of variables with non-zero variance.
nonzero_var <- sum(variance_X != 0)

# Retaining variables with non-zero variance.
# drop = FALSE keeps the result a data frame even when only one column
# survives; without it that single column collapses to a bare vector and
# loses its column name.
trainX_retained <- data_numsellers_sam[, variance_X != 0, drop = FALSE]

data_numsellers_sam <- trainX_retained

# Samsonite seller-count model. The formula contains no smooth terms, so
# every predictor enters linearly; the fit uses a Gaussian family with
# identity link and REML.
#
# Candidate terms currently left out of the formula:
#   decr10per_3p_sam, decr10per_amzn_sam, decr5per_3p_sam, decr20per_3p_sam,
#   decr5per_amzn_sam, decr20per_amzn_sam, incr20per_3p_sam, incr5per_3p_sam
#
# NOTE(review): NUM_SELLERS_SAM is not one of the columns placed in
# data_numsellers_sam by this script, so gam() presumably resolves it through
# the attach()ed modeldata. The same fallback would apply to any predictor
# removed by the variance check -- confirm this is intended.
gam_numsellers_sam <- gam(
  formula = NUM_SELLERS_SAM ~
    Answered_Questionssmsnite_lag1 +
    buyboxprice_smsnite_lag1 +
    nonbbox3pprice_sam_lag1 +
    priceAmzn_sam_lag1 +
    incr5per_amzn_sam +
    incr10per_amzn_sam +
    Product_reviews_smsnite_lag1 +
    salerank_subcat_smsnite_lag1 +
    incr10per_3p_sam +
    incr20per_amzn_sam +
    weekend +
    seasonal_sale,
  family = gaussian(link = 'identity'),
  data = data_numsellers_sam,
  method = "REML",
  optimizer = c("outer", "newton"),
  fit = TRUE
)

# Print the coefficient table and fit statistics.
summary(gam_numsellers_sam)

# Travelers Choice: Number of sellers

# Candidate predictors for the Travelers Choice model. The bare names are
# presumably columns made visible by attach(modeldata) -- confirm if that
# call changes. cbind() first combines them into a single matrix (so all
# columns share one type); data.frame() then turns that matrix into named
# columns.
numsellers_tc_matrix <- cbind(
  decr10per_3p_tc, decr20per_3p_tc, decr5per_3p_tc,
  decr10per_amzn_tc, decr20per_amzn_tc, decr5per_amzn_tc,
  incr10per_amzn_tc, incr20per_amzn_tc, incr5per_amzn_tc,
  incr10per_3p_tc, incr20per_3p_tc, incr5per_3p_tc,
  buyboxprice_tc_lag1, nonbbox3pprice_tc_lag1,
  priceAmzn_tc_lag1, priceclust2_tc_lag1,
  tc_bottombrandlag1, tc_topbrandlag1,
  weekend,
  # Additional variables, used in the absence of review text
  Answered_Questionstc_lag1, Product_reviews_tc_lag1,
  prodstar_tc_lag1,
  salerank_tc_lag1, salerank_subcat_tc_lag1,
  num_tcused3p_lag1, num_tcusedamzn_lag1,
  seasonal_sale
)
data_numsellers_tc <- data.frame(numsellers_tc_matrix)

# VARIANCE CHECK
# Sample variance of every candidate predictor column. vapply() iterates the
# columns directly (safe even for a zero-column frame, unlike 1:ncol()) and
# insists that var() yields exactly one numeric value per column.
variance_X <- vapply(data_numsellers_tc, var, numeric(1), USE.NAMES = FALSE)

# Number of variables with non-zero variance.
nonzero_var <- sum(variance_X != 0)

# Retaining variables with non-zero variance.
# drop = FALSE keeps the result a data frame even when only one column
# survives; without it that single column collapses to a bare vector and
# loses its column name.
trainX_retained <- data_numsellers_tc[, variance_X != 0, drop = FALSE]

data_numsellers_tc <- trainX_retained
# Travelers Choice seller-count model. The formula contains no smooth terms,
# so every predictor enters linearly; the fit uses a Gaussian family with
# identity link and REML.
#
# Candidate term currently left out of the formula:
#   buyboxprice_tc_lag1
#
# NOTE(review): NUM_SELLERS_TC is not one of the columns placed in
# data_numsellers_tc by this script, so gam() presumably resolves it through
# the attach()ed modeldata. The same fallback would apply to any predictor
# removed by the variance check -- confirm this is intended.
gam_numsellers_tc <- gam(
  formula = NUM_SELLERS_TC ~
    Answered_Questionstc_lag1 +
    priceAmzn_tc_lag1 +
    nonbbox3pprice_tc_lag1 +
    priceclust2_tc_lag1 +
    incr20per_3p_tc +
    prodstar_tc_lag1 +
    incr5per_3p_tc +
    Product_reviews_tc_lag1 +
    weekend +
    incr10per_3p_tc +
    seasonal_sale,
  family = gaussian(link = 'identity'),
  data = data_numsellers_tc,
  method = "REML",
  optimizer = c("outer", "newton"),
  fit = TRUE
)

# Print the coefficient table and fit statistics.
summary(gam_numsellers_tc)

                        
                          
                          

                        
                         